In [4]:
# Analysis of the distribution of trees in Paris
# The dataset is imported from a CSV file.
# The pandas module is imported to visualize and explore the data.
In [5]:
import pandas as pd
In [6]:
# Load the raw tree inventory (semicolon-separated, UTF-8) and get an overview of its columns.
# NOTE(review): the path below is an absolute path on one machine; the notebook only
# runs elsewhere if the CSV is placed at the same location.
csv_path = r'C:\Users\imane\OneDrive\Desktop\Data4good\p2-arbres-fr.csv'
df = pd.read_csv(csv_path, sep=";", encoding="utf-8")
# missingno bar chart of the freshly loaded dataframe (one bar per column)
import missingno as msno
msno.bar(df)
Out[6]:
<AxesSubplot:>
In [7]:
# Remove, in a single in-place call, the columns that are not needed for the analysis
irrelevant_columns = [
    "id",
    "complement_addresse",
    "numero",
    "id_emplacement",
    "espece",
    "variete",
    "remarquable",
    "type_emplacement",
    "libelle_francais",
]
df.drop(columns=irrelevant_columns, inplace=True)
In [8]:
# Remove rows describing trees whose girth is 0 or above 470 cm (not a common occurrence in Paris),
# whose height is 0 or above 35 m, or that lack a girth, a height or a development stage.
girth = df["circonference_cm"]
height = df["hauteur_m"]
# Comparisons against NaN are False, so missing values are caught by the second mask
implausible = girth.eq(0) | height.eq(0) | girth.gt(470) | height.gt(35)
incomplete = df[["circonference_cm", "hauteur_m", "stade_developpement"]].isna().any(axis=1)
df.drop(index=df.index[implausible | incomplete], inplace=True)
In [9]:
# Express heights in centimetres instead of metres (the column keeps its old name in this cell).
# NOTE: this overwrites the column, so running the cell twice multiplies by 100 twice.
df["hauteur_m"] = df["hauteur_m"].mul(100)
In [10]:
# The height column now holds centimetres, so rename it accordingly (returns a new dataframe)
new_df = df.rename({"hauteur_m": "hauteur_cm"}, axis="columns")
# Display the cleaned dataframe
new_df
Out[10]:
domanialite arrondissement lieu genre circonference_cm hauteur_cm stade_developpement geo_point_2d_a geo_point_2d_b
1 Jardin PARIS 7E ARRDT MAIRIE DU 7E 116 RUE DE GRENELLE PARIS 7E Taxus 65 800 A 48.857656 2.321031
2 Jardin PARIS 7E ARRDT MAIRIE DU 7E 116 RUE DE GRENELLE PARIS 7E Taxus 90 1000 A 48.857705 2.321061
3 Jardin PARIS 7E ARRDT MAIRIE DU 7E 116 RUE DE GRENELLE PARIS 7E Acer 60 800 A 48.857722 2.321006
8 Jardin PARIS 16E ARRDT JARDIN DE L AVENUE FOCH / 10 AVENUE FOCH Sophora 145 1400 A 48.871990 2.275814
9 Jardin PARIS 16E ARRDT JARDIN DE L AVENUE FOCH / 10 AVENUE FOCH Sophora 135 1000 A 48.872046 2.275752
... ... ... ... ... ... ... ... ... ...
198859 DJS PARIS 19E ARRDT CENTRE SPORTIF JULES LADOUMEGUE / 35 ROUTE DES... Populus 20 500 J 48.890466 2.397443
198860 Jardin PARIS 14E ARRDT PARC MONTSOURIS Fagus 55 700 J 48.823919 2.337872
198861 Jardin PARIS 14E ARRDT PARC MONTSOURIS Taxus 55 500 JA 48.821099 2.338411
198862 Jardin PARIS 14E ARRDT PARC MONTSOURIS Taxus 75 500 JA 48.823552 2.337892
198865 DJS PARIS 13E ARRDT CENTRE SPORTIF GEORGES CARPENTIER / 81 BOULEVA... Acer 165 1100 A 48.819252 2.370641

129996 rows × 9 columns

In [11]:
# Boxplot of girth: shows its distribution and the outlying values
new_df.boxplot(column="circonference_cm")
Out[11]:
<AxesSubplot:>
In [12]:
# Boxplot of height (in cm): shows its distribution and the outlying values
new_df.boxplot(column="hauteur_cm")
Out[12]:
<AxesSubplot:>
In [13]:
# missingno bar chart on the cleaned dataframe, to check whether any missing values are left
msno.bar(df=new_df)
Out[13]:
<AxesSubplot:>
In [14]:
# Import the plotting module and add a per-row tree counter:
# each row represents one tree, so the counter is 1 everywhere and can be summed after a groupby.
import seaborn as sns
n_tree = [1] * len(new_df)
new_df["n_tree"] = n_tree
In [15]:
# Shorten the district labels so that the graphs are not cluttered.
# The result is assigned back to the column instead of calling replace(..., inplace=True)
# on new_df["arrondissement"]: an in-place method on a column selection is a chained
# assignment, which warns on recent pandas and leaves new_df untouched under Copy-on-Write.
new_df["arrondissement"] = new_df["arrondissement"].replace({
    "PARIS 10E ARRDT":"10E ARR","PARIS 11E ARRDT":"11E ARR","PARIS 12E ARRDT":"12E ARR",
    "PARIS 13E ARRDT":"13E ARR","PARIS 14E ARRDT":"14E ARR","PARIS 15E ARRDT":"15E ARR",
    "PARIS 16E ARRDT":"16E ARR","PARIS 17E ARRDT":"17E ARR","PARIS 18E ARRDT":"18E ARR",
    "PARIS 19E ARRDT":"19E ARR","PARIS 1ER ARRDT":"1ER ARR","PARIS 20E ARRDT":"20E ARR",
    "PARIS 2E ARRDT":"2E ARR","PARIS 3E ARRDT":"3E ARR","PARIS 4E ARRDT":"4E ARR",
    "PARIS 5E ARRDT":"5E ARR","PARIS 6E ARRDT":"6E ARR","PARIS 7E ARRDT":"7E ARR",
    "PARIS 8E ARRDT":"8E ARR","PARIS 9E ARRDT":"9E ARR","SEINE-SAINT-DENIS":"S.S.DENIS",
    "VAL-DE-MARNE":"V.MARNE","BOIS DE BOULOGNE":"B. BOULOGNE","BOIS DE VINCENNES":"B.VINCENNES",
    "HAUTS-DE-SEINE":"H.SEINE",
})
In [16]:
# Total number of trees per district, shaped for the next barplot:
# a dataframe indexed by "arrondissement" with the single column "n_tree".
# Only the counter column is selected before summing. Summing the whole frame and
# dropping the unwanted numeric columns afterwards relied on pandas silently discarding
# the text columns, which pandas 2.0 no longer does (they would be concatenated and
# then end up in the melted data of the barplot).
new_df_2 = new_df.groupby("arrondissement")[["n_tree"]].sum()
In [17]:
# Horizontal barplot of the total number of trees per district.
# The aggregated frame is first reshaped to long format (one row per district and variable).
for_sns = new_df_2.reset_index().melt(
    id_vars=["arrondissement"],
    value_vars=new_df_2.columns,
)
p = sns.barplot(data=for_sns, x="value", y="arrondissement", hue="variable")
p.set_title(" Nombre d'arbres par arroundissement")
Out[17]:
Text(0.5, 1.0, " Nombre d'arbres par arroundissement")
In [18]:
# Keep only the tree counter and the genus, then count the trees of each genus
genre = new_df[["n_tree", "genre"]]
df_map = genre.copy()
n_df = df_map.groupby(["genre"])["n_tree"].sum().reset_index()
# Only the 8 genera with the highest totals are plotted
highest_v = n_df.nlargest(8, "n_tree")
# Barplot of total trees per genus, bars ordered from the largest total to the smallest
bar_order = highest_v.sort_values("n_tree", ascending=False).genre
sns.barplot(data=highest_v, x="genre", y="n_tree", order=bar_order)
Out[18]:
<AxesSubplot:xlabel='genre', ylabel='n_tree'>
In [19]:
# Work on a copy restricted to the district and the two size measurements
selected_col = new_df.loc[:, ["arrondissement", "circonference_cm", "hauteur_cm"]]
new_df_3 = selected_col.copy()
# Mean girth and mean height of the trees of each district
ndf_3 = new_df_3.groupby("arrondissement").agg("mean")
for_sns_2 = ndf_3.reset_index()
# Scatterplot: one point per district, mean girth on x and mean height on y
g = sns.scatterplot(
    data=for_sns_2,
    x="circonference_cm",
    y="hauteur_cm",
    hue="arrondissement",
)
g.set_title("hauteur_cm et circonference_cm moyenne par arrondissement")
Out[19]:
Text(0.5, 1.0, 'hauteur_cm et circonference_cm moyenne par arrondissement')
In [20]:
# Work on a copy restricted to the development stage and the two size measurements
selected_col_2 = new_df.loc[:, ["stade_developpement", "hauteur_cm", "circonference_cm"]]
new_df_4 = selected_col_2.copy()
# Mean height and mean girth of the trees of each development stage
ndf_4 = new_df_4.groupby("stade_developpement").agg("mean")
In [21]:
# Line plot showing the average height and girth of trees at the different growth stages
for_sns_3 = ndf_4.reset_index()
import matplotlib.pyplot as plt
fig, ax = plt.subplots()
# Both series are drawn on the Axes created above (passed explicitly rather than relying
# on the "current axes") and are labelled so that the two lines can be told apart.
# NOTE(review): the stages appear on the x axis in the order of their codes, which is
# presumably not the chronological growth order - confirm and reorder if needed.
ax = sns.lineplot(data=for_sns_3, x="stade_developpement", y="circonference_cm",
                  label="circonference_cm", ax=ax)
ax1 = sns.lineplot(data=for_sns_3, x="stade_developpement", y="hauteur_cm",
                   label="hauteur_cm", ax=ax)
# The y axis is shared by both measurements: without this it would be named after
# the last plotted series only.
ax.set_ylabel("cm")
ax.legend()
ax.set_title("Hauteur_cm et circonference_cm moyenne par stade de developpement")
Out[21]:
Text(0.5, 1.0, 'Hauteur_cm et circonference_cm moyenne par stade de developpement')
In [22]:
# Work on a copy restricted to the development stage, the tree counter and the district
selected_col_3 = new_df.loc[:, ["stade_developpement", "n_tree", "arrondissement"]]
new_df_5 = selected_col_3.copy()
# Number of trees for every (district, development stage) pair
ndf_5 = (
    new_df_5
    .groupby(["arrondissement", "stade_developpement"])
    .sum()
    .reset_index()
)
# Stacked bar plot: total trees per district, split by growth stage.
# The pre-computed totals are used as weights of the histogram.
graph = sns.histplot(
    data=ndf_5,
    x="arrondissement",
    weights="n_tree",
    hue="stade_developpement",
    multiple="stack",
)
graph.set_title("Nombre d'arbres par arrondissement et stade de developpement")
Out[22]:
Text(0.5, 1.0, "Nombre d'arbres par arrondissement et stade de developpement")
In [23]:
# Work on a copy restricted to the tree counter, the domaniality and the district
selected_col_4 = new_df.loc[:, ["n_tree", "domanialite", "arrondissement"]]
new_df_6 = selected_col_4.copy()
# Number of trees for every (domaniality, district) pair
ndf_6 = (
    new_df_6
    .groupby(["domanialite", "arrondissement"])
    .sum()
    .reset_index()
)
# Treemap of the number of trees: domaniality at the first level, district at the second
import plotly.express as px
figure = px.treemap(
    ndf_6,
    path=["domanialite", "arrondissement"],
    values="n_tree",
)
figure.show()
In [24]:
# Create a sub dataframe and keep only relevant columns for the plot
selected_col_5 = new_df[["arrondissement","n_tree","geo_point_2d_a","geo_point_2d_b"]]
new_df_7 = selected_col_5.copy()
# One row per district: total number of trees, plus the coordinates of the first tree
# listed for that district (first non-missing value), used as the position of its circle.
# Everything is computed in a single aggregation so that totals and coordinates are
# guaranteed to belong to the same district. Taking groupby.nth(0) separately and joining
# it back by row position mixes up districts on pandas >= 2.0, where nth() keeps the
# original row order instead of returning one row per sorted group key.
ndf_7 = (
    new_df_7
    .groupby("arrondissement")
    .agg(
        n_tree=("n_tree", "sum"),
        geo_point_2d_a=("geo_point_2d_a", "first"),
        geo_point_2d_b=("geo_point_2d_b", "first"),
    )
    .reset_index()
)
import folium
# Plot a map that shows the total trees of each district (circles are proportionate to totals).
# The map object is not called `map` so that Python's builtin map() stays usable.
# OpenStreetMap tiles are used because "Stamen Terrain" is no longer shipped as a
# built-in tileset by recent folium releases.
tree_map = folium.Map(location=[48.856614, 2.3522219], zoom_start=14, control_scale=True,
                      tiles="OpenStreetMap")
n_districts = len(ndf_7)
for district in ndf_7.itertuples(index=False):
    folium.Circle(
        location=[district.geo_point_2d_a, district.geo_point_2d_b],
        # Readable tooltip text instead of the repr of a tuple
        tooltip=f"{district.arrondissement}: {district.n_tree}",
        # Same scaling as before: district total divided by the number of districts
        radius=int(district.n_tree) / n_districts,
        fill=True,
    ).add_to(tree_map)
tree_map
Out[24]:
Make this Notebook Trusted to load map: File -> Trust Notebook
In [25]:
# Work on a copy restricted to the district, the two coordinates and the domaniality
selected_col_6 = new_df.loc[:, ["arrondissement", "geo_point_2d_a", "geo_point_2d_b", "domanialite"]]
new_df_8 = selected_col_6.copy()
# Map with one point per tree: coloured by district, district as hover title,
# domaniality as extra hover information
scatter_map = px.scatter_mapbox(
    new_df_8,
    lat="geo_point_2d_a",
    lon="geo_point_2d_b",
    color="arrondissement",
    hover_name="arrondissement",
    hover_data=["domanialite"],
    zoom=14,
    height=500,
)
scatter_map.update_layout(mapbox_style="open-street-map")